Just checking my ability to load the AHP data and see overall structure with a PCA
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(metamoRph)
proteome_df <- data.table::fread('../data/AhpDb_PsmMatrix.csv')
## Warning in data.table::fread("../data/AhpDb_PsmMatrix.csv"): Discarded
## single-line footer: <<(18869 rows affected)>>
proteome_df <- proteome_df[-1,] # remove visual formatting line
# make numeric
proteome_matrix <- proteome_df[,-1] %>% as.matrix()
mode(proteome_matrix) = "numeric"
## Warning in mde(x): NAs introduced by coercion
row.names(proteome_matrix) <- proteome_df$Accession
proteome_meta <- read_delim('../data/AhpDb_ProteinSummary.csv', delim = 'asdf')
## Rows: 1649 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "asdf"
## chr (1): UniProt_Id ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(proteome_meta) <- 'one'
# some wacky stuff to handle the ... interesting ... data format
proteome_meta <- proteome_meta %>%
mutate_if(is.character, utf8::utf8_encode) %>%
separate(one, into =
c("UniProt_Id", "Gene_Symbol", "Protein_Name", "Gene_Names", "Accession", "PsmSum", "MeanPsm", "PsmPresent", "PctPresent"),
sep = '\\s\\s+,|,\\s\\s+')
## Warning: Expected 9 pieces. Missing pieces filled with `NA` in 2 rows [1,
## 1649].
# first row is visual formatting
# last row is empty
proteome_meta <- proteome_meta[-c(1,1649),]
sample_meta <- read_csv('../data/AhpDb_ClinicalData.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 313 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (62): SampleId, CD, Sex, Age, Race, Ethnicity, Iop, IopMethod, OdOs, Inc...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_meta <- sample_meta[-c(1,314),]
Should remove the handful of samples with a PSM sum less than 2000 and sum normalize (a la TPM)
proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% density() %>% plot()
proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 687 10093 11469 11783 13115 23364
remove_id <- proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% enframe() %>% filter(value < 2000)
proteome_tib <- proteome_matrix %>% as_tibble(rownames = 'Accession') %>% select(-contains(remove_id$name))
Eight samples are REALLY different than the rest
proteome_mat_clean <- proteome_tib %>% select(contains("AH")) %>% as.matrix()
row.names(proteome_mat_clean) <- proteome_tib$Accession
aligned_sample_meta <- colnames(proteome_mat_clean) %>% enframe(value = 'SampleId') %>% select(-name) %>%
left_join(sample_meta)
## Joining with `by = join_by(SampleId)`
proteome_mat_clean[is.na(proteome_mat_clean)] <- 0
pca_mm <- run_pca(proteome_mat_clean,aligned_sample_meta)
## Sample CPM scaling
## log1p scaling
## prcomp start
plot <- pca_mm$PCA$x %>% as_tibble(rownames = 'SampleId') %>%
left_join(pca_mm$meta, by = 'SampleId') %>%
ggplot(aes(x=PC1,y=PC2,color = CD, label = SampleId)) +
geom_point() +
scale_color_manual(values = pals::alphabet() %>% unname()) +
cowplot::theme_cowplot()
plotly::ggplotly(plot)
Removing those eight samples and AH602 (top left) and redoing the PCA
Looks reasonable now
proteome_mat_clean_filter <- proteome_mat_clean[,pca_mm$PCA$x[,c('PC1','PC2')] %>% as_tibble(rownames = 'sample') %>% filter(PC1 < 20, PC2 < 20) %>% pull(sample)]
aligned_sample_meta <- colnames(proteome_mat_clean_filter) %>% enframe(value = 'SampleId') %>% select(-name) %>%
left_join(sample_meta)
## Joining with `by = join_by(SampleId)`
pca_mm <- run_pca(proteome_mat_clean_filter,aligned_sample_meta)
## Sample CPM scaling
## log1p scaling
## prcomp start
plot <- pca_mm$PCA$x %>% as_tibble(rownames = 'SampleId') %>%
left_join(pca_mm$meta, by = 'SampleId') %>%
ggplot(aes(x=PC1,y=PC2,color = CD, label = SampleId)) +
geom_point() +
scale_color_manual(values = pals::alphabet() %>% unname()) +
cowplot::theme_cowplot()
plotly::ggplotly(plot)
save(proteome_mat_clean_filter, aligned_sample_meta, file = '../data/ahpdb_cleaned_data.Rdata')
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] metamoRph_0.2.0 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
## [5] dplyr_1.1.2 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0
## [9] tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 rlang_1.1.1
## [3] magrittr_2.0.3 matrixStats_1.0.0
## [5] compiler_4.3.0 DelayedMatrixStats_1.21.0
## [7] vctrs_0.6.3 maps_3.4.1
## [9] pkgconfig_2.0.3 crayon_1.5.2
## [11] fastmap_1.1.1 ellipsis_0.3.2
## [13] XVector_0.40.0 labeling_0.4.2
## [15] scuttle_1.9.4 utf8_1.2.3
## [17] rmarkdown_2.23 tzdb_0.4.0
## [19] bit_4.0.5 xfun_0.40
## [21] bluster_1.9.1 zlibbioc_1.46.0
## [23] cachem_1.0.8 pals_1.7
## [25] beachmat_2.15.0 GenomeInfoDb_1.36.1
## [27] jsonlite_1.8.7 highr_0.10
## [29] DelayedArray_0.26.7 BiocParallel_1.33.11
## [31] irlba_2.3.5.1 parallel_4.3.0
## [33] cluster_2.1.4 R6_2.5.1
## [35] bslib_0.5.0 stringi_1.7.12
## [37] limma_3.56.1 GenomicRanges_1.52.0
## [39] jquerylib_0.1.4 Rcpp_1.0.11
## [41] SummarizedExperiment_1.30.2 knitr_1.43
## [43] IRanges_2.34.1 Matrix_1.5-4.1
## [45] igraph_1.4.3 timechange_0.2.0
## [47] tidyselect_1.2.0 rstudioapi_0.14
## [49] dichromat_2.0-0.1 abind_1.4-5
## [51] yaml_2.3.7 codetools_0.2-19
## [53] lattice_0.21-8 Biobase_2.60.0
## [55] withr_2.5.0 evaluate_0.21
## [57] pillar_1.9.0 MatrixGenerics_1.12.3
## [59] stats4_4.3.0 plotly_4.10.1
## [61] generics_0.1.3 vroom_1.6.3
## [63] RCurl_1.98-1.12 S4Vectors_0.38.1
## [65] hms_1.1.3 sparseMatrixStats_1.11.1
## [67] munsell_0.5.0 scales_1.2.1
## [69] glue_1.6.2 metapod_1.7.0
## [71] mapproj_1.2.11 lazyeval_0.2.2
## [73] tools_4.3.0 BiocNeighbors_1.17.1
## [75] data.table_1.14.8 ScaledMatrix_1.8.1
## [77] locfit_1.5-9.7 scran_1.27.1
## [79] cowplot_1.1.1 grid_4.3.0
## [81] crosstalk_1.2.0 edgeR_3.42.2
## [83] colorspace_2.1-0 SingleCellExperiment_1.22.0
## [85] GenomeInfoDbData_1.2.10 BiocSingular_1.15.0
## [87] cli_3.6.1 rsvd_1.0.5
## [89] fansi_1.0.4 viridisLite_0.4.2
## [91] S4Arrays_1.0.5 gtable_0.3.3
## [93] sass_0.4.7 digest_0.6.33
## [95] BiocGenerics_0.46.0 dqrng_0.3.0
## [97] htmlwidgets_1.6.2 htmltools_0.5.5
## [99] lifecycle_1.0.3 httr_1.4.6
## [101] statmod_1.5.0 bit64_4.0.5